remove(list = ls())

library(tidyverse)
library(randomForest)
library(lme4)
library(stargazer)
library(caret)
library(ggridges)
library(pdp)
library(DiagrammeR)

# Loading in data 
complete_set <- read.csv(path)

##########################
#### Making functions ####
##########################

# Random forest function
do_forest <- function(df) {
  forest <- randomForest(as.factor(number_rating) ~ ., data = df, 
                         ntree = 10000, mtry = round(sqrt(ncol(df) - 1)), sampsize = 1000, importance = TRUE)
  return(forest)
}

# Visualize forest function
visualize_forest <- function(forest_object) {
  model_vis <- as.data.frame(importance(forest_object))
  
  model_vis <- cbind(rownames(model_vis), model_vis)
  
  colnames(model_vis) <- c("variable", "1", "2", "3", "4", "accuracy", "gini")
  
  model_vis$name <- factor(model_vis$variable, levels = model_vis$variable[order(model_vis$accuracy)])
  
  
  model_vis %>%
    arrange(desc(accuracy)) %>%
    slice(1:10) %>%
    ggplot(aes(x = as.factor(name), y = accuracy))+
    geom_point(size = 3, alpha = .3)+
    geom_segment(aes(x=variable, 
                     xend=variable, 
                     y=0, 
                     yend=max(accuracy)), 
                 linetype="dashed", 
                 size=0.1)+
    theme_bw()+
    labs(title="Dot plot", 
         subtitle="Variables sorted by mean decrease in accuracy",
         x = "",
         y = "") +  
    coord_flip()
}

# Confusion matrix function
do_confusion <- function(model, train_set, valid_set) {
  pred_train <- predict(model, train_set, type = "class")
  
  table(pred_train, train_set$number_rating)
  
  pred_valid <- predict(model, valid_set, type = "class")
  
  confusion <- table(pred_valid, valid_set$number_rating)
  
  confuse_matrix <- confusionMatrix(confusion)
  return(confuse_matrix)
}

#########################
#### Making analyses ####
#########################
# Standardizing vars
library(MASS)
library(coefplot)

mean_center <- function(x) {
  new <- x - mean(x, na.rm = TRUE)
  new <- scale(new)
}

complete_set$satisfaction_std <- mean_center(complete_set$satisfaction)
complete_set$expect_std <- mean_center(complete_set$expected_point_score)
complete_set$absence_std <- mean_center(complete_set$absence_perc)
complete_set$writing_std <- mean_center(complete_set$prog_writ)
complete_set$math_std <- mean_center(complete_set$prog_math)
complete_set$read_std <- mean_center(complete_set$prog_read)
complete_set$disadprog_std <- mean_center(complete_set$pct_disad_exp_prog_read)



mod <- polr(as.factor(number_rating) ~  disadprog_std + writing_std 
            + math_std + read_std  + expect_std + absence_std +  satisfaction_std, data = complete_set)

coefplot(mod, coefficients = c("expect_std", "satisfaction_std", "absence_std",
                               "writing_std", "read_std", "math_std", "disadprog_std"))